-
Notifications
You must be signed in to change notification settings - Fork 764
migration for inactive user purge #6676
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,16 +7,15 @@ | |
from django.utils import timezone | ||
from django.contrib.auth import get_user_model | ||
|
||
from kitsune.users.utils import delete_user_pipeline | ||
|
||
|
||
def delete_inactive_users(apps, schema_editor): | ||
""" | ||
Delete users who haven't logged in for over three years. | ||
""" | ||
User = get_user_model() | ||
|
||
utils_module = importlib.import_module('kitsune.users.utils') | ||
delete_user_pipeline = utils_module.delete_user_pipeline | ||
|
||
has_content_criteria = ( | ||
Q(answer_votes__isnull=False) | ||
| Q(answers__isnull=False) | ||
|
@@ -43,7 +42,7 @@ def delete_inactive_users(apps, schema_editor): | |
|
||
cutoff_date = timezone.now() - timedelta(days=3*365) | ||
|
||
query = User.objects.filter(last_login__lt=cutoff_date) | ||
query = User.objects.filter(last_login__lt=cutoff_date).annotate(has_content=has_content_criteria) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are you using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This use case absolutely works - I think it's just not specifically called out in the 4.2 docs but is in later docs. I can run this query and see the output and it builds valid sql which executes properly. I'm annotating here so we can later divide our users into those with and those without content, so we can execute a quicker delete process on non-content users. |
||
total_users = query.count() | ||
|
||
if total_users == 0: | ||
|
@@ -52,61 +51,61 @@ def delete_inactive_users(apps, schema_editor): | |
|
||
print(f"Starting deletion of {total_users:,} inactive users") | ||
start_time = time.time() | ||
deleted_count = 0 | ||
batch_size = 1000 | ||
processed_count = 0 | ||
pipeline_count = 0 | ||
direct_count = 0 | ||
|
||
last_id = 0 | ||
while True: | ||
batch = list( | ||
query.filter(id__gt=last_id) | ||
.order_by('id') | ||
.annotate(has_content=has_content_criteria) | ||
[:batch_size] | ||
) | ||
if not batch: | ||
break | ||
last_id = max(user.id for user in batch) | ||
|
||
users_with_content = [user for user in batch if user.has_content] | ||
users_no_content = [user for user in batch if not user.has_content] | ||
for user in users_with_content: | ||
# Batch collect users without content for bulk deletion | ||
users_no_content_batch = [] | ||
batch_size = 1000 | ||
|
||
def bulk_delete_no_content_users(): | ||
nonlocal direct_count | ||
if users_no_content_batch: | ||
try: | ||
User._base_manager.filter(id__in=users_no_content_batch).delete() | ||
direct_count += len(users_no_content_batch) | ||
users_no_content_batch.clear() | ||
except Exception as e: | ||
print(f"Error batch deleting users: {e}") | ||
|
||
for user in query.iterator(chunk_size=1000): | ||
if user.has_content: | ||
try: | ||
delete_user_pipeline(user) | ||
pipeline_count += 1 | ||
except Exception as e: | ||
print(f"Error deleting user {user.id}: {e}") | ||
else: | ||
users_no_content_batch.append(user.id) | ||
|
||
# Bulk delete when batch is full | ||
if len(users_no_content_batch) >= batch_size: | ||
bulk_delete_no_content_users() | ||
|
||
if users_no_content: | ||
try: | ||
user_ids = [user.id for user in users_no_content] | ||
User._base_manager.filter(id__in=user_ids).delete() | ||
direct_count += len(users_no_content) | ||
except Exception as e: | ||
print(f"Error batch deleting users: {e}") | ||
|
||
# Update progress reporting | ||
batch_count = len(batch) | ||
deleted_count += batch_count | ||
processed_count += 1 | ||
|
||
elapsed_time = time.time() - start_time | ||
progress_pct = (deleted_count/total_users*100) if total_users > 0 else 0 | ||
avg_time = elapsed_time / deleted_count if deleted_count > 0 else 0 | ||
remaining_time = (total_users - deleted_count) * avg_time if deleted_count > 0 else 0 | ||
current_rate = deleted_count / elapsed_time * 60 if elapsed_time > 0 else 0 | ||
|
||
print( | ||
f"Progress: {deleted_count:,}/{total_users:,} ({progress_pct:.1f}%) | " | ||
f"Pipeline: {pipeline_count:,} | Direct: {direct_count:,} | " | ||
f"Rate: {current_rate:.1f} users/min | " | ||
f"Remaining: {timedelta(seconds=int(remaining_time))}" | ||
) | ||
# Progress reporting every 1000 users | ||
if processed_count % 1000 == 0: | ||
elapsed_time = time.time() - start_time | ||
progress_pct = (processed_count/total_users*100) if total_users > 0 else 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit. Since you've already checked if |
||
avg_time = elapsed_time / processed_count if processed_count > 0 else 0 | ||
remaining_time = (total_users - processed_count) * avg_time if processed_count > 0 else 0 | ||
current_rate = processed_count / elapsed_time * 60 if elapsed_time > 0 else 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit. Same here. I think for all of these lines, the |
||
|
||
print( | ||
f"Progress: {processed_count:,}/{total_users:,} ({progress_pct:.1f}%) | " | ||
f"Pipeline: {pipeline_count:,} | Direct: {direct_count:,} | " | ||
f"Rate: {current_rate:.1f} users/min | " | ||
f"Remaining: {timedelta(seconds=int(remaining_time))}" | ||
) | ||
|
||
# Delete any remaining users without content | ||
bulk_delete_no_content_users() | ||
|
||
total_time = time.time() - start_time | ||
print( | ||
f"Deletion Complete: {deleted_count:,} users deleted " | ||
f"Deletion Complete: {processed_count:,} users deleted " | ||
f"({pipeline_count:,} pipeline, {direct_count:,} direct) " | ||
f"in {timedelta(seconds=int(total_time))}" | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This needs to be
User = apps.get_model("auth", "User")
in migrations similar to the previous one (0034)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is necessary because the
delete_user_pipeline
function needs an actual User instance vs. a historical model.